# -*- coding: utf-8 -*-
"""
Created on Thu Aug 12 15:09:50 2021

@author: Administrator
"""

import numpy as np
import pandas as pd
from pylab import *
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split#切割数据集
from sklearn.ensemble import RandomForestRegressor #随机森林回归
from sklearn.metrics import mean_squared_error #mse评价指标
from sklearn.model_selection import KFold#交叉验证
from sklearn.model_selection import GridSearchCV#网格搜索

# 1. Load the raw datasets (tab-separated text files).
TRAIN_PATH = r'C:\Users\Administrator\Desktop\工业蒸汽量预测\zhengqi_train.txt'
TEST_PATH = r'C:\Users\Administrator\Desktop\工业蒸汽量预测\zhengqi_test.txt'
train_data = pd.read_csv(TRAIN_PATH, sep='\t', encoding='utf-8')
test_data = pd.read_csv(TEST_PATH, sep='\t', encoding='utf-8')
column = train_data.columns.tolist()[:39]  # names of the first 39 columns

# Drop the features whose distributions differ between train and test sets
# (they would mislead a model fitted on train and scored on test).
DRIFTED_FEATURES = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
train_data = train_data.drop(DRIFTED_FEATURES, axis=1)
test_data = test_data.drop(DRIFTED_FEATURES, axis=1)

# 2. Min-max scale all feature columns to [0, 1].
# The scaler is fitted on the TRAINING features only, then applied to both
# train and test so that both share the same scaling parameters.
features_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
# BUG FIX: the original line transformed train_data here, so the "test"
# frame was actually a scaled copy of the training set.
test_data_scaler = min_max_scaler.transform(test_data[features_columns])
train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
train_data_scaler['target'] = train_data['target']
column = train_data_scaler.columns.tolist()[:33]  # 33 remaining feature names + target

# 3. Use PCA to remove collinearity, keeping enough components to explain
# 90% of the variance. PCA is fitted on the scaled training features
# (all columns except the trailing 'target') and applied to the test set.
pca = PCA(n_components=0.9)
new_train_pca_90 = pca.fit_transform(train_data_scaler.iloc[:, 0:-1])
new_test_pca_90 = pca.transform(test_data_scaler)
new_train_pca_90 = pd.DataFrame(new_train_pca_90)
new_test_pca_90 = pd.DataFrame(new_test_pca_90)
new_train_pca_90['target'] = train_data_scaler['target']
new_train_pca_90.describe()

train_pca = new_train_pca_90.fillna(0)  # PCA-reduced training data, NaNs zero-filled
# BUG FIX: the original sliced train/target from new_train_pca_90, leaving
# the NaN-cleaned train_pca unused; slice from the cleaned frame instead.
train = train_pca[new_test_pca_90.columns]
target = train_pca['target']

# Random-forest modelling workflow:
# grid-search tuning, then 5-fold cross-validation with train/valid MSE.

# 4. Tune the random forest's base-estimator parameters with a grid search
# (5-fold CV). NOTE(review): this grid has 3*2*9*8*9 = 3888 combinations
# x 5 folds, which can take a long time to run.
RF = RandomForestRegressor()
# Parameter grid for the search.
parameters = {
    'n_estimators': [50, 100, 200],
    'max_features': np.arange(1, 3, 1),        # max features per split (default is sqrt of n_features)
    'max_depth': np.arange(1, 10, 1),          # maximum tree depth; deeper growth is pruned
    'min_samples_split': np.arange(2, 10, 1),  # a node may split only if each child would get >= this many samples
    'min_samples_leaf': np.arange(1, 10, 1),   # each child of a split must keep at least this many samples
}
GS = GridSearchCV(RF, parameters, cv=5)
GS.fit(train, target)
# BUG FIX: the bare expression `GS.best_params_` was a no-op in a script
# (it only displays a value in a REPL); print it so the result is visible.
print(GS.best_params_)
# Observed best parameters: max_depth=9, max_features=2, min_samples_leaf=1,
# min_samples_split=4, n_estimators=200 — hard-coded in the CV loop below.
# 5-fold cross-validation: train the tuned random forest on each fold and
# record the training and validation MSE.
N_FOLDS = 5
kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=20)
# Per-fold MSE records for train and validation splits.
MSE_DICT = {'train_mse': [], 'test_mse': []}

for fold_no, (idx_train, idx_valid) in enumerate(kfold.split(train), start=1):
    X_tr = train.values[idx_train]
    X_va = train.values[idx_valid]
    y_tr = target[idx_train]
    y_va = target[idx_valid]

    # Random forest with the parameters found by the grid search above.
    model = RandomForestRegressor(n_estimators=200,
                                  max_depth=9,
                                  max_features=2,
                                  min_samples_leaf=1,
                                  min_samples_split=4)
    model.fit(X=X_tr, y=y_tr)

    # Predictions on both splits of this fold.
    pred_tr = model.predict(X_tr)
    pred_va = model.predict(X_va)

    print('第{k}折 训练和预测  训练MSE  预测MSE'.format(k=fold_no))
    fold_train_mse = mean_squared_error(pred_tr, y_tr)
    print('---------\n', '训练MSE\n', fold_train_mse, '\n---------')
    fold_valid_mse = mean_squared_error(pred_va, y_va)
    print('---------\n', '预测MSE\n', fold_valid_mse, '\n---------\n')

    MSE_DICT['train_mse'].append(fold_train_mse)
    MSE_DICT['test_mse'].append(fold_valid_mse)

# Summary: per-fold MSE lists and their means.
print('---------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n',
      np.mean(MSE_DICT['train_mse']), '\n---------')
print('---------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n',
      np.mean(MSE_DICT['test_mse']), '\n---------\n')
